% Wide and Deep learning paper (Cheng et al., 2016 workshop).
% Title and venue are stored in Title Case so that case-preserving styles
% render this entry consistently with the other entries in this file;
% sentence-casing styles still downcase the title on their own.
@inproceedings{cheng2016wide,
  author    = {Cheng, Heng-Tze and Koc, Levent and Harmsen, Jeremiah and Shaked, Tal and Chandra, Tushar and Aradhye, Hrishi and Anderson, Glen and Corrado, Greg and Chai, Wei and Ispir, Mustafa and others},
  title     = {Wide \& Deep Learning for Recommender Systems},
  booktitle = {Proceedings of the 1st Workshop on Deep Learning for Recommender Systems},
  pages     = {7--10},
  year      = {2016},
}

% Sampling-bias-corrected retrieval model (Yi et al., 2019).
% Title stored in Title Case: a style can downcase it, but cannot restore
% capitals that were dropped by the export.
@inproceedings{yi2019sampling,
  author    = {Yi, Xinyang and Yang, Ji and Hong, Lichan and Cheng, Derek Zhiyuan and Heldt, Lukasz and Kumthekar, Aditee and Zhao, Zhe and Wei, Li and Chi, Ed},
  title     = {Sampling-Bias-Corrected Neural Modeling for Large Corpus Item Recommendations},
  booktitle = {Proceedings of the 13th ACM Conference on Recommender Systems},
  pages     = {269--277},
  year      = {2019},
}

% Entire Space Multi-Task Model (Ma et al., 2018).
% Title stored in Title Case so case-preserving styles match the rest of
% the file; sentence-casing styles downcase it themselves.
@inproceedings{ma2018entire,
  author    = {Ma, Xiao and Zhao, Liqin and Huang, Guan and Wang, Zhi and Hu, Zelin and Zhu, Xiaoqiang and Gai, Kun},
  title     = {Entire Space Multi-Task Model: An Effective Approach for Estimating Post-Click Conversion Rate},
  booktitle = {The 41st International ACM SIGIR Conference on Research \& Development in Information Retrieval},
  pages     = {1137--1140},
  year      = {2018},
}

% Transformers4Rec (de Souza Pereira Moreira et al., 2021).
% The system name and the acronym NLP are brace-protected so that
% sentence-casing styles cannot turn them into "Transformers4rec" / "nlp".
@inproceedings{de2021transformers4rec,
  author    = {de Souza Pereira Moreira, Gabriel and Rabhi, Sara and Lee, Jeong Min and Ak, Ronay and Oldridge, Even},
  title     = {{Transformers4Rec}: Bridging the Gap between {NLP} and Sequential/Session-Based Recommendation},
  booktitle = {Proceedings of the 15th ACM Conference on Recommender Systems},
  pages     = {143--153},
  year      = {2021},
}

% Ekko, OSDI 2022. The address field holds the conference location as
% exported by USENIX. Author names are in "Last, First" form.
@inproceedings{280902,
  author    = {Sima, Chijun and Fu, Yao and Sit, Man-Kit and Guo, Liyi and Gong, Xuri and Lin, Feng and Wu, Junyu and Li, Yongsheng and Rong, Haidong and Aublin, Pierre-Louis and Mai, Luo},
  title     = {Ekko: A {Large-Scale} Deep Learning Recommender System with {Low-Latency} Model Update},
  booktitle = {16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)},
  pages     = {821--839},
  publisher = {USENIX Association},
  address   = {Carlsbad, CA},
  month     = jul,
  year      = {2022},
  isbn      = {978-1-939133-28-1},
  url       = {https://www.usenix.org/conference/osdi22/presentation/sima},
}

% Parameter Server, OSDI 2014.
% Fixes: stray leading space removed from the ISBN; venue acronyms braced
% to match the sibling OSDI 14 entry in this file. The address field holds
% the conference location as exported by USENIX.
@inproceedings{186214,
  author    = {Li, Mu and Andersen, David G. and Park, Jun Woo and Smola, Alexander J. and Ahmed, Amr and Josifovski, Vanja and Long, James and Shekita, Eugene J. and Su, Bor-Yiing},
  title     = {Scaling Distributed Machine Learning with the Parameter Server},
  booktitle = {11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14)},
  pages     = {583--598},
  publisher = {{USENIX} Association},
  address   = {Broomfield, CO},
  month     = oct,
  year      = {2014},
  isbn      = {978-1-931971-16-4},
  url       = {https://www.usenix.org/conference/osdi14/technical-sessions/presentation/li_mu},
}

% Project Adam, OSDI 2014.
% Fixes: stray leading space removed from the ISBN; the proper noun "Adam"
% is brace-protected so sentence-casing styles do not print "Project adam".
@inproceedings{ProjectAdam_186212,
  author    = {Chilimbi, Trishul and Suzue, Yutaka and Apacible, Johnson and Kalyanaraman, Karthik},
  title     = {Project {Adam}: Building an Efficient and Scalable Deep Learning Training System},
  booktitle = {11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14)},
  pages     = {571--582},
  publisher = {{USENIX} Association},
  address   = {Broomfield, CO},
  month     = oct,
  year      = {2014},
  isbn      = {978-1-931971-16-4},
  url       = {https://www.usenix.org/conference/osdi14/technical-sessions/presentation/chilimbi},
}

% Concise version vectors (Malkhi and Terry, 2005).
% "WinFS" is brace-protected so sentence-casing styles do not print "winfs".
% Field values use brace delimiters; the abstract text is unchanged.
@inproceedings{ConciseVV_10.1007/11561927_25,
  author    = {Malkhi, Dahlia and Terry, Doug},
  editor    = {Fraigniaud, Pierre},
  title     = {Concise Version Vectors in {WinFS}},
  booktitle = {Distributed Computing},
  pages     = {339--353},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2005},
  isbn      = {978-3-540-32075-3},
  abstract  = {Conflicts naturally arise in optimistically replicated systems. The common way to detect update conflicts is via version vectors, whose storage and communication overhead are number of replicas {\texttimes} number of objects. These costs may be prohibitive for large systems.},
}

% Vector sets for replica synchronization (Malkhi, Novik, Purcell, 2007).
% Fixes: page range uses the ASCII double hyphen instead of a Unicode en
% dash (classic BibTeX is 8-bit only); the empty abstract field is dropped;
% "P2P" is brace-protected so it is not rendered as "P2p".
@article{VectorSet_10.1145/1243418.1243427,
  author     = {Malkhi, Dahlia and Novik, Lev and Purcell, Chris},
  title      = {{P2P} Replica Synchronization with Vector Sets},
  journal    = {SIGOPS Oper. Syst. Rev.},
  volume     = {41},
  number     = {2},
  pages      = {68--74},
  numpages   = {7},
  month      = apr,
  year       = {2007},
  issue_date = {April 2007},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0163-5980},
  doi        = {10.1145/1243418.1243427},
  url        = {https://doi.org/10.1145/1243418.1243427},
}

% ImageNet challenge paper (IJCV 2015).
% Fixes: the title is no longer wrapped in a second brace pair (which
% disabled all style casing); only the proper noun "ImageNet" is protected
% and its capitalisation restored. The author list previously stopped after
% ten names with no "and others"; the final two authors are now listed.
@article{russakovsky2015imagenet,
  author    = {Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and Berg, Alexander C. and Fei-Fei, Li},
  title     = {{ImageNet} Large Scale Visual Recognition Challenge},
  journal   = {International Journal of Computer Vision (IJCV)},
  volume    = {115},
  number    = {3},
  pages     = {211--252},
  year      = {2015},
  publisher = {Springer},
}

% Pointer sentinel mixture models (arXiv preprint, 2016).
% Title stored in Title Case for consistency with the rest of the file.
% The arXiv identifier stays in the journal field so classic styles that
% ignore eprint fields still print it.
@article{merity2016pointer,
  author  = {Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard},
  title   = {Pointer Sentinel Mixture Models},
  journal = {arXiv preprint arXiv:1609.07843},
  year    = {2016},
}

% DeepFM, IJCAI 2017.
% Fixes: the surname "TANG" was stored in all capitals and is normalised to
% "Tang"; "DeepFM" and "CTR" are brace-protected so sentence-casing styles
% do not print "Deepfm" / "ctr". Names are in "Last, First" form.
@inproceedings{ijcai2017-239,
  author    = {Guo, Huifeng and Tang, Ruiming and Ye, Yunming and Li, Zhenguo and He, Xiuqiang},
  title     = {{DeepFM}: A Factorization-Machine based Neural Network for {CTR} Prediction},
  booktitle = {Proceedings of the Twenty-Sixth International Joint Conference on Artificial Intelligence, {IJCAI-17}},
  pages     = {1725--1731},
  year      = {2017},
  doi       = {10.24963/ijcai.2017/239},
  url       = {https://doi.org/10.24963/ijcai.2017/239},
}

% DLRM (arXiv preprint, 2019).
% Title stored in Title Case for consistency with the rest of the file.
% The arXiv identifier stays in the journal field so classic styles that
% ignore eprint fields still print it.
@article{naumov2019deep,
  author  = {Naumov, Maxim and Mudigere, Dheevatsa and Shi, Hao-Jun Michael and Huang, Jianyu and Sundaraman, Narayanan and Park, Jongsoo and Wang, Xiaodong and Gupta, Udit and Wu, Carole-Jean and Azzolini, Alisson G and others},
  title   = {Deep Learning Recommendation Model for Personalization and Recommendation Systems},
  journal = {arXiv preprint arXiv:1906.00091},
  year    = {2019},
}

% Software reference: the Merlin repository on GitHub.
% The URL lives in howpublished so styles without url support still print it.
@misc{Merlin,
  author       = {NVIDIA},
  title        = {{{NVIDIA Merlin}}},
  howpublished = {\url{https://github.com/NVIDIA-Merlin/Merlin}},
  year         = {2022},
  note         = {Accessed on 2022-03-24},
}

% Hidden technical debt paper (Sculley et al., NIPS volume 28, 2015).
% Fixes: the empty pages field is dropped (an empty field carries no data
% and is flagged by validators); the cedilla in "Francois" is wrapped as a
% BibTeX special character so sorting and label generation handle it.
@inproceedings{NIPS2015_86df7dcf,
  author    = {Sculley, D. and Holt, Gary and Golovin, Daniel and Davydov, Eugene and Phillips, Todd and Ebner, Dietmar and Chaudhary, Vinay and Young, Michael and Crespo, Jean-Fran{\c{c}}ois and Dennison, Dan},
  title     = {Hidden Technical Debt in Machine Learning Systems},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
  volume    = {28},
  publisher = {Curran Associates, Inc.},
  year      = {2015},
  url       = {https://proceedings.neurips.cc/paper/2015/file/86df7dcfd896fcaf2674f757a2463eba-Paper.pdf},
}

% Software reference: the NVTabular repository on GitHub.
% The URL lives in howpublished so styles without url support still print it.
@misc{NVTabular,
  author       = {NVIDIA},
  title        = {{{NVIDIA NVTabular}}},
  howpublished = {\url{https://github.com/NVIDIA-Merlin/NVTabular}},
  year         = {2022},
  note         = {Accessed on 2022-03-24},
}

% Software reference: the HugeCTR repository on GitHub.
% The URL lives in howpublished so styles without url support still print it.
@misc{HugeCTR,
  author       = {NVIDIA},
  title        = {{{NVIDIA HugeCTR}}},
  howpublished = {\url{https://github.com/NVIDIA-Merlin/HugeCTR}},
  year         = {2022},
  note         = {Accessed on 2022-03-24},
}

% Software reference: the Triton inference server repository on GitHub.
% The URL lives in howpublished so styles without url support still print it.
@misc{Triton,
  author       = {NVIDIA},
  title        = {{{NVIDIA Triton}}},
  howpublished = {\url{https://github.com/triton-inference-server/server}},
  year         = {2022},
  note         = {Accessed on 2022-03-24},
}

% Software-hardware co-design for DLRM training (arXiv preprint, 2021).
% Author list is truncated with "and others"; the style renders it as et al.
@article{zionex,
  author  = {Mudigere, Dheevatsa and Hao, Yuchen and Huang, Jianyu and Jia, Zhihao and Tulloch, Andrew and Sridharan, Srinivas and Liu, Xing and Ozdal, Mustafa and Nie, Jade and Park, Jongsoo and others},
  title   = {Software-Hardware Co-design for Fast and Scalable Training of Deep Learning Recommendation Models},
  journal = {arXiv preprint arXiv:2104.05158},
  year    = {2021},
}

% TurboTransformers, PPoPP '21.
% Fixes: page range uses the ASCII double hyphen instead of a Unicode en
% dash; "TurboTransformers" and "GPU" are brace-protected against
% sentence-casing. The abstract text is unchanged.
@inproceedings{10.1145/3437801.3441578,
  author    = {Fang, Jiarui and Yu, Yang and Zhao, Chengduo and Zhou, Jie},
  title     = {{TurboTransformers}: An Efficient {GPU} Serving System for Transformer Models},
  booktitle = {Proceedings of the 26th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming},
  series    = {PPoPP '21},
  pages     = {389--402},
  numpages  = {14},
  location  = {Virtual Event, Republic of Korea},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2021},
  isbn      = {9781450382946},
  doi       = {10.1145/3437801.3441578},
  url       = {https://doi.org/10.1145/3437801.3441578},
  keywords  = {serving system, deep learning runtime, GPU, transformers},
  abstract  = {The transformer is the most critical algorithm innovation of the Nature Language Processing (NLP) field in recent years. Unlike the Recurrent Neural Network (RNN) models, transformers are able to process on dimensions of sequence lengths in parallel, therefore leads to better accuracy on long sequences. However, efficient deployments of them for online services in data centers equipped with GPUs are not easy. First, more computation introduced by transformer structures makes it more challenging to meet the latency and throughput constraints of serving. Second, NLP tasks take in sentences of variable length. The variability of input dimensions brings a severe problem to efficient memory management and serving optimization.To solve the above challenges, this paper designed a transformer serving system called TurboTransformers, which consists of a computing runtime and a serving framework. Three innovative features make it stand out from other similar works. An efficient parallel algorithm is proposed for GPU-based batch reduction operations, like Softmax and LayerNorm, which are major hot spots besides BLAS routines. A memory allocation algorithm, which better balances the memory footprint and allocation/free efficiency, is designed for variable-length input situations. A serving framework equipped with a new batch scheduler using dynamic programming achieves the optimal throughput on variable-length requests. The system can achieve the state-of-the-art transformer model serving performance on GPU platforms and can be seamlessly integrated into your PyTorch code with a few lines of code.},
}

% LightSeq, NAACL 2021 industry track.
% Field values use brace delimiters and the author list is on one line;
% every value is otherwise as exported.
@inproceedings{wang-etal-2021-lightseq,
  author    = {Wang, Xiaohui and Xiong, Ying and Wei, Yang and Wang, Mingxuan and Li, Lei},
  title     = {{L}ight{S}eq: A High Performance Inference Library for Transformers},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers},
  pages     = {113--120},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jun,
  year      = {2021},
  doi       = {10.18653/v1/2021.naacl-industry.15},
  url       = {https://aclanthology.org/2021.naacl-industry.15},
  abstract  = {Transformer and its variants have achieved great success in natural language processing. Since Transformer models are huge in size, serving these models is a challenge for real industrial applications. In this paper, we propose , a highly efficient inference library for models in the Transformer family. includes a series of GPU optimization techniques to both streamline the computation of Transformer layers and reduce memory footprint. supports models trained using PyTorch and Tensorflow. Experimental results on standard machine translation benchmarks show that achieves up to 14x speedup compared with TensorFlow and 1.4x speedup compared with , a concurrent CUDA implementation. The code will be released publicly after the review.},
}

% TT-Rec, MLSys volume 3 (2021).
% "TT-Rec" is brace-protected so sentence-casing styles do not print
% "Tt-rec".
@inproceedings{MLSYS2021_979d472a,
  author    = {Yin, Chunxing and Acun, Bilge and Wu, Carole-Jean and Liu, Xing},
  title     = {{TT-Rec}: Tensor Train Compression for Deep Learning Recommendation Models},
  booktitle = {Proceedings of Machine Learning and Systems},
  editor    = {A. Smola and A. Dimakis and I. Stoica},
  volume    = {3},
  pages     = {448--462},
  year      = {2021},
  url       = {https://proceedings.mlsys.org/paper/2021/file/979d472a84804b9f647bc185a877a8b5-Paper.pdf},
}

% Hierarchical GPU parameter server, MLSys volume 2 (2020).
% The acronym "GPU" is brace-protected so sentence-casing styles do not
% print "gpu".
@inproceedings{MLSYS2020_f7e6c855,
  author    = {Zhao, Weijie and Xie, Deping and Jia, Ronglai and Qian, Yulei and Ding, Ruiquan and Sun, Mingming and Li, Ping},
  title     = {Distributed Hierarchical {GPU} Parameter Server for Massive Scale Deep Learning Ads Systems},
  booktitle = {Proceedings of Machine Learning and Systems},
  editor    = {I. Dhillon and D. Papailiopoulos and V. Sze},
  volume    = {2},
  pages     = {412--428},
  year      = {2020},
  url       = {https://proceedings.mlsys.org/paper/2020/file/f7e6c85504ce6e82442c770f7c8606f0-Paper.pdf},
}

% Unbiased online active learning, KDD '11.
% Fix: page range uses the ASCII double hyphen instead of a Unicode en dash,
% which classic 8-bit BibTeX cannot handle reliably. Abstract unchanged.
@inproceedings{10.1145/2020408.2020444,
  author    = {Chu, Wei and Zinkevich, Martin and Li, Lihong and Thomas, Achint and Tseng, Belle},
  title     = {Unbiased Online Active Learning in Data Streams},
  booktitle = {Proceedings of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  series    = {KDD '11},
  pages     = {195--203},
  numpages  = {9},
  location  = {San Diego, California, USA},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2011},
  isbn      = {9781450308137},
  doi       = {10.1145/2020408.2020444},
  url       = {https://doi.org/10.1145/2020408.2020444},
  keywords  = {unbiasedness, bayesian online learning, active learning, data streaming, adaptive importance sampling},
  abstract  = {Unlabeled samples can be intelligently selected for labeling to minimize classification error. In many real-world applications, a large number of unlabeled samples arrive in a streaming manner, making it impossible to maintain all the data in a candidate pool. In this work, we focus on binary classification problems and study selective labeling in data streams where a decision is required on each sample sequentially. We consider the unbiasedness property in the sampling process, and design optimal instrumental distributions to minimize the variance in the stochastic process. Meanwhile, Bayesian linear classifiers with weighted maximum likelihood are optimized online to estimate parameters. In empirical evaluation, we collect a data stream of user-generated comments on a commercial news portal in 30 consecutive days, and carry out offline evaluation to compare various sampling strategies, including unbiased active learning, biased variants, and random sampling. Experimental results verify the usefulness of online active learning, especially in the non-stationary situation with concept drift.},
}

% Facebook ad click prediction, ADKDD'14.
% Fixes: (1) page range uses the ASCII double hyphen instead of a Unicode en
% dash; (2) the last author's compound surname is "Quinonero Candela" (with
% tilde), so both words go before the comma and the accent is written as a
% BibTeX special character; (3) the percent sign in the abstract is escaped,
% because a bare one comments out the rest of the line if the abstract is
% ever written to the .bbl; (4) "Facebook" is brace-protected.
@inproceedings{10.1145/2648584.2648589,
  author    = {He, Xinran and Pan, Junfeng and Jin, Ou and Xu, Tianbing and Liu, Bo and Xu, Tao and Shi, Yanxin and Atallah, Antoine and Herbrich, Ralf and Bowers, Stuart and Qui{\~n}onero Candela, Joaquin},
  title     = {Practical Lessons from Predicting Clicks on Ads at {Facebook}},
  booktitle = {Proceedings of the Eighth International Workshop on Data Mining for Online Advertising},
  series    = {ADKDD'14},
  pages     = {1--9},
  numpages  = {9},
  location  = {New York, NY, USA},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2014},
  isbn      = {9781450329996},
  doi       = {10.1145/2648584.2648589},
  url       = {https://doi.org/10.1145/2648584.2648589},
  abstract  = {Online advertising allows advertisers to only bid and pay for measurable user responses, such as clicks on ads. As a consequence, click prediction systems are central to most online advertising systems. With over 750 million daily active users and over 1 million active advertisers, predicting clicks on Facebook ads is a challenging machine learning task. In this paper we introduce a model which combines decision trees with logistic regression, outperforming either of these methods on its own by over 3\%, an improvement with significant impact to the overall system performance. We then explore how a number of fundamental parameters impact the final prediction performance of our system. Not surprisingly, the most important thing is to have the right features: those capturing historical information about the user or ad dominate other types of features. Once we have the right features and the right model (decisions trees plus logistic regression), other factors play small roles (though even small improvements are important at scale). Picking the optimal handling for data freshness, learning rate schema and data sampling improve the model slightly, though much less than adding a high-value feature, or picking the right model to begin with.},
}

% Continuum, SoCC '18.
% Fix: page range uses the ASCII double hyphen instead of a Unicode en dash,
% which classic 8-bit BibTeX cannot handle reliably. Abstract unchanged.
@inproceedings{10.1145/3267809.3267817,
  author    = {Tian, Huangshi and Yu, Minchen and Wang, Wei},
  title     = {Continuum: A Platform for Cost-Aware, Low-Latency Continual Learning},
  booktitle = {Proceedings of the ACM Symposium on Cloud Computing},
  series    = {SoCC '18},
  pages     = {26--40},
  numpages  = {15},
  location  = {Carlsbad, CA, USA},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2018},
  isbn      = {9781450360111},
  doi       = {10.1145/3267809.3267817},
  url       = {https://doi.org/10.1145/3267809.3267817},
  keywords  = {Competitive Analysis, Continual Learning System, Online Algorithm},
  abstract  = {Many machine learning applications operate in dynamic environments that change over time, in which models must be continually updated to capture the recent trend in data. However, most of today's learning frameworks perform training offline, without a system support for continual model updating.In this paper, we design and implement Continuum, a general-purpose platform that streamlines the implementation and deployment of continual model updating across existing learning frameworks. In pursuit of fast data incorporation, we further propose two update policies, cost-aware and best-effort, that judiciously determine when to perform model updating, with and without accounting for the training cost (machine-time), respectively. Theoretical analysis shows that cost-aware policy is 2-competitive. We implement both polices in Continuum, and evaluate their performance through EC2 deployment and trace-driven simulations. The evaluation shows that Continuum results in reduced data incorporation latency, lower training cost, and improved model quality in a number of popular online learning applications that span multiple application domains, programming languages, and frameworks.},
}

% Kraken, SC20.
% Fixes: page range uses a double hyphen (a single hyphen typesets as a
% hyphen, not a range dash); the empty volume and number fields are dropped;
% trailing whitespace from the export is removed.
@inproceedings{9355295,
  author    = {Xie, Minhui and Ren, Kai and Lu, Youyou and Yang, Guangxu and Xu, Qingxing and Wu, Bihai and Lin, Jiazhen and Ao, Hongbo and Xu, Wanhong and Shu, Jiwu},
  title     = {Kraken: Memory-Efficient Continual Learning for Large-Scale Real-Time Recommendations},
  booktitle = {SC20: International Conference for High Performance Computing, Networking, Storage and Analysis},
  pages     = {1--17},
  year      = {2020},
  doi       = {10.1109/SC41405.2020.00025},
}

% EdgeRec (Gong et al., 2020).
% "EdgeRec" and the product name "Mobile Taobao" are brace-protected so
% sentence-casing styles do not print "Edgerec" / "mobile taobao".
@inproceedings{gong2020edgerec,
  author    = {Gong, Yu and Jiang, Ziwen and Feng, Yufei and Hu, Binbin and Zhao, Kaiqi and Liu, Qingwen and Ou, Wenwu},
  title     = {{EdgeRec}: Recommender System on Edge in {Mobile Taobao}},
  booktitle = {Proceedings of the 29th ACM International Conference on Information \& Knowledge Management},
  pages     = {2477--2484},
  year      = {2020},
}

% Group knowledge transfer, NeurIPS volume 33 (2020).
% "CNNs" is brace-protected so sentence-casing styles do not print "cnns".
@inproceedings{NEURIPS2020_a1d4c20b,
  author    = {He, Chaoyang and Annavaram, Murali and Avestimehr, Salman},
  title     = {Group Knowledge Transfer: Federated Learning of Large {CNNs} at the Edge},
  booktitle = {Advances in Neural Information Processing Systems},
  editor    = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin},
  volume    = {33},
  pages     = {14068--14080},
  publisher = {Curran Associates, Inc.},
  year      = {2020},
  url       = {https://proceedings.neurips.cc/paper/2020/file/a1d4c20b182ad7137ab3606f0e3fc8a4-Paper.pdf},
}

% MicroRec, MLSys volume 3 (2021).
% Fixes: the sharp s in "Preusser" is written as a braced BibTeX special
% character instead of a bare macro followed by a space; the stray space
% before the comma in "Li , Yong" is removed; "MicroRec" is brace-protected
% so sentence-casing styles do not print "Microrec".
@inproceedings{MLSYS2021_ec895663,
  author    = {Jiang, Wenqi and He, Zhenhao and Zhang, Shuai and Preu{\ss}er, Thomas B. and Zeng, Kai and Feng, Liang and Zhang, Jiansong and Liu, Tongxuan and Li, Yong and Zhou, Jingren and Zhang, Ce and Alonso, Gustavo},
  title     = {{MicroRec}: Efficient Recommendation Inference by Hardware and Data Structure Solutions},
  booktitle = {Proceedings of Machine Learning and Systems},
  editor    = {A. Smola and A. Dimakis and I. Stoica},
  volume    = {3},
  pages     = {845--859},
  year      = {2021},
  url       = {https://proceedings.mlsys.org/paper/2021/file/ec8956637a99787bd197eacd77acce5e-Paper.pdf},
}

% Compositional embeddings, KDD '20.
% Fixes: page range uses the ASCII double hyphen instead of a Unicode en
% dash; the empty abstract field is dropped.
@inproceedings{10.1145/3394486.3403059,
  author    = {Shi, Hao-Jun Michael and Mudigere, Dheevatsa and Naumov, Maxim and Yang, Jiyan},
  title     = {Compositional Embeddings Using Complementary Partitions for Memory-Efficient Recommendation Systems},
  booktitle = {Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  series    = {KDD '20},
  pages     = {165--175},
  numpages  = {11},
  location  = {Virtual Event, CA, USA},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2020},
  isbn      = {9781450379984},
  doi       = {10.1145/3394486.3403059},
  url       = {https://doi.org/10.1145/3394486.3403059},
  keywords  = {model compression, recommendation systems, embeddings},
}

% Mixed dimension embeddings (arXiv preprint, identified by the eprint
% fields). Author names are in "Last, First" form.
@misc{ginart2021mixed,
  author        = {Ginart, Antonio and Naumov, Maxim and Mudigere, Dheevatsa and Yang, Jiyan and Zou, James},
  title         = {Mixed Dimension Embeddings with Application to Memory-Efficient Recommendation Systems},
  year          = {2021},
  eprint        = {1909.11810},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG}
}